
##################################################
#REPLICATION FILE FOR WAIT AND SEE (TEXT ANALYSIS)
##################################################

################################################
#OUTLINE:
# SECTION 1: PRELIMINARY TIME SERIES AND TOKENIZING, WORDCLOUDS
# SECTION 2: WEEKLY WORD COUNT TIME SERIES GRAPHS
# SECTION 3: SENTIMENT ANALYSIS AND GRAPHS
# SECTION 4: LOG ODDS CALCULATIONS AND VISUALISATIONS


# WARNING: SOME PARTS OF THE CODE TAKE A WHILE TO RUN (even more than 10-15 mins).
# Recommendation for replication: run line by line


################################################## 
##### INSTALL AND LOAD NECESSARY PACKAGES ##### 
# Note for replication: Make sure to install the following packages before running.

rm(list=ls())
#install.packages("readtext")
library(dplyr)
library(roperators)
library(lubridate)
library(ggplot2)
library(gtools)
library(tidytext)
library(wordcloud)
library(tidyr)
library(tidyverse)
library(stringr)
library(scales)
library(reshape2)
library(magrittr)

################################################## 
# Load the data containing the text from newspapers during the sample time period
# The file name is relative, so it is resolved against the working directory
# that is active *before* the setwd() call further down.
# NOTE(review): presumably this creates the object `data` used throughout the
# script (columns such as X, date, week, Publication, Text) -- confirm against
# the contents of data_R.RData.
load(file='data_R.RData')


#LOCATIONS: change to your own
# All ggsave()/dev.print() calls below use bare file names, so every figure is
# written into this directory. "/" is only a placeholder value.
setwd("/")
#################################################################################################### 



####################################################################################################  
# SECTION 1: PRELIMINARY TIME SERIES AND TOKENIZING
####################################################################################################  

####################################################################################################  
#PART A. VISUALISE TIMESERIES

# Attach to every row the number of distinct articles (column `X`) published
# on the same day. `data` is left grouped by `date` after this step.
data %<>%
  group_by(date) %>%
  mutate(TotalDaily = n_distinct(X))

# Daily article counts for dates strictly between 30 May 2005 and 24 Nov 2005.
# Red vertical lines mark 7 July, 15 July and 6 August 2005.
# scale_x_date() is called without the stray empty argument (", ,") that used
# to sit between date_breaks and date_labels.
data %>%
  filter(date > ymd(20050530), date < ymd(20051124)) %>%
  ggplot(aes(x = date, y = TotalDaily)) +
  scale_x_date(date_breaks = "months", date_labels = "%b-%y") +
  geom_line() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  geom_vline(xintercept = ymd(20050707), color = "red") +
  geom_vline(xintercept = ymd(20050715), color = "red") +
  geom_vline(xintercept = ymd(20050806), color = "red") +
  labs(y = "Daily Newspaper Articles", x = "Date")
ggsave("DailyArticles.png", width = 9, height = 6)


# Same figure with one additional blue marker on 21 July 2005.
data %>%
  filter(date > ymd(20050530), date < ymd(20051124)) %>%
  ggplot(aes(x = date, y = TotalDaily)) +
  scale_x_date(date_breaks = "months", date_labels = "%b-%y") +
  geom_line() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  geom_vline(xintercept = ymd(20050707), color = "red") +
  geom_vline(xintercept = ymd(20050715), color = "red") +
  geom_vline(xintercept = ymd(20050721), color = "blue") +
  geom_vline(xintercept = ymd(20050806), color = "red") +
  labs(y = "Daily Newspaper Articles", x = "Date")
ggsave("DailyArticles_Failed.png", width = 9, height = 6)

###
# Classify each outlet into one of three groups; publications that are not
# listed get NA.
tabloid_titles <- c("The Express", "The Mirror", "Daily Star Sunday", "DAILY MAIL (London)", "The People", "Morning Star", "The Sun" )
guardian_independent_titles <- c("The Guardian (London)", "Guardian.com", "The Independent (London)")
telegraph_times_titles <- c("The Times (London)","The Sunday Times (London)", "THE DAILY TELEGRAPH(LONDON)")

data <- data %>%
  mutate(
    CategoryPublication = case_when(
      Publication %in% tabloid_titles ~ "Tabloid",
      Publication %in% guardian_independent_titles ~ "Guardian & Independent",
      Publication %in% telegraph_times_titles ~ "Telegraph & Times"
    )
  )

table(data$CategoryPublication)


# Distinct articles (column `X`) per day and category, then per week and
# category. The second group_by() replaces the first, so `data` leaves this
# section grouped by (week, CategoryPublication).
# NOTE(review): the weekly count is computed before any date filter; if `data`
# spans more than one calendar year, equal week numbers would be pooled --
# confirm the date range of `data`.
data <- data %>%
  group_by(date, CategoryPublication) %>%
  mutate(TotalPublicationDaily = n_distinct(X)) %>%
  group_by(week, CategoryPublication) %>%
  mutate(TotalPublicationWeekly = n_distinct(X))

# Weekly article counts per category. filter() also discards rows whose
# category is NA, because the comparison with "" evaluates to NA for them.
data %>%
  filter(date > ymd(20050530), date < ymd(20051124), CategoryPublication != "") %>%
  ggplot(aes(x = week, y = TotalPublicationWeekly, colour = CategoryPublication)) +
  geom_line() +
  theme_bw() +
  geom_vline(xintercept = week(ymd(20050706)), color = "red") +
  geom_vline(xintercept = week(ymd(20050714)), color = "red") +
  geom_vline(xintercept = week(ymd(20050805)), color = "red") +
  theme(panel.border = element_blank()) +
  labs(y = "Weekly Newspaper Articles", x = "Week")
ggsave("WeeklyByPublication.png", width = 9, height = 6)



####################################################################################################  
#PART B. CLEAN THE TEXT AND SPLIT TO CREATE WORD TOKENS, COMPUTE WORD COUNTS AND VISUALIZE

# Make sure the text and outlet columns are plain character vectors.
data$Text <- as.character(data$Text)
data$Publication <- as.character(data$Publication)

# "Later" window: dates strictly between 14 July and 6 August 2005.
data_later <- data %>%
  filter(date > ymd(20050714), date < ymd(20050806))
summary(data_later$date)

# "Now" window: dates strictly between 6 July and 15 July 2005.
data_now <- data %>%
  filter(date > ymd(20050706), date < ymd(20050715))
summary(data_now$date)

# create tokens, tidytext
# `data` (and therefore both windows) still carries the grouping set by the
# last group_by() applied to it earlier in the script. The grouping is removed
# before tokenising for two reasons:
#   * count() on a grouped tibble tallies within each group, so the word
#     frequencies printed below would be per-group rather than overall;
#   * NOTE(review): per the tidytext documentation of `collapse`, grouped input
#     to unnest_tokens() is treated as a request to collapse text within
#     groups -- confirm against the installed tidytext version.
# The grouping columns themselves are kept; only the grouping is dropped.
tidy_text <- data %>%
  ungroup() %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words)

tidy_data_later <- data_later %>%
  ungroup() %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words)

tidy_data_now <- data_now %>%
  ungroup() %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words)

summary(tidy_data_later$week)
summary(tidy_data_now$week)

# Most frequent non-stop-words: full corpus, "now" window, "later" window.
tidy_text %>%
  count(word, sort = TRUE)

tidy_data_now %>%
  count(word, sort = TRUE)

tidy_data_later %>%
  count(word, sort = TRUE)

####################################################################################################  
#WORDCLOUDS (note for replication: word clouds slightly vary by position of word so created also bar graph frequencies)
####################################################################################################  

# Remove the category and week columns from both token tables.
# NOTE(review): presumably done so that count() below tallies words over the
# whole window instead of per (week, category) group -- confirm.
tidy_data_now <- subset(tidy_data_now, select = -c(CategoryPublication, week))
tidy_data_later <- subset(tidy_data_later, select = -c(CategoryPublication, week))


# Bar chart of words occurring more than 1000 times in the "now" window,
# ordered by frequency.
tidy_data_now %>%
  count(word, sort = TRUE) %>%
  filter(n > 1000) %>%
  ggplot(aes(x = n, y = reorder(word, n))) +
  geom_col() +
  labs(y = NULL)
ggsave("FigureH4a.png", width = 8, height = 6)


# Bar chart of words occurring more than 1200 times in the "later" window.
tidy_data_later %>%
  count(word, sort = TRUE) %>%
  filter(n > 1200) %>%
  ggplot(aes(x = n, y = reorder(word, n))) +
  geom_col() +
  labs(y = NULL)
ggsave("FigureH4b.png", width = 8, height = 6)

# One seed for both clouds: the second cloud's layout therefore depends on the
# random numbers consumed by the first one, so keep the order of the two calls.
set.seed(10001001)

# Word cloud for the "now" window (at most 30 words). dev.print() copies the
# plot from the current graphics device into the PNG file; dev.off() then
# closes the current device.
now_word_counts <- count(tidy_data_now, word, sort = TRUE)
wordcloud(now_word_counts$word, now_word_counts$n, max.words = 30)
dev.print(png, 'WordCloud_FirstWeek.png', width = 800, height = 900, res = 200)
dev.off()

# Word cloud for the "later" window (at most 60 words), exported the same way.
later_word_counts <- count(tidy_data_later, word, sort = TRUE)
wordcloud(later_word_counts$word, later_word_counts$n, max.words = 60)
dev.print(png, 'WordCloud_MonthLater.png', width = 800, height = 900, res = 200)
dev.off()

####################################################################################################  
#COMPUTE WORD COUNTS

# Number of case-insensitive matches of `pattern` in each article's text.
# The pattern is interpreted as a regular expression and matches substrings,
# not whole words.
count_mentions <- function(pattern) {
  str_count(data$Text, regex(pattern = pattern, ignore_case = TRUE))
}

data$terrorlaw <- count_mentions("terror law")

# "counterterrorism" written as one word or as two
data$counter1 <- count_mentions("counterterrorism")
data$counter2 <- count_mentions("counter terror")
data$counter <- data$counter1 + data$counter2

data$tap <- count_mentions("tap phone")

data$idcard1 <- count_mentions("identity card")
data$idcard2 <- count_mentions("ID card")
data$idcard <- data$idcard1 + data$idcard2

# NOTE(review): "ban" is matched as a substring, so it also counts words that
# merely contain those three letters -- confirm this is intended.
data$ban <- count_mentions("ban")

# detain* and tortur* are pooled into one measure
data$detain1 <- count_mentions("detain")
data$detain2 <- count_mentions("tortur")
data$detain <- data$detain1 + data$detain2

# NOTE(review): unlike every other keyword, these two are matched
# case-sensitively (lower case only) -- confirm this is intended.
data$security <- str_count(data$Text, "security")
data$freedom <- str_count(data$Text, "freedom")

data$muslim1 <- count_mentions("Muslim")
data$muslim2 <- count_mentions("islam")
data$muslim <- data$muslim1 + data$muslim2

data$solidarity <- count_mentions("solidarity")

data$intelligence <- count_mentions("intelligence")

data$bill1 <- count_mentions("terrorism bill")
data$bill2 <- count_mentions("terrorism act")
data$bill <- data$bill1 + data$bill2


# Readable label for weeks 22-27; every other week gets NA.
data <- data %>%
  mutate(
    weeklabel = case_when(
      week == 22 ~ "30th May",
      week == 23 ~ "6th June",
      week == 24 ~ "13th June",
      week == 25 ~ "20th June",
      week == 26 ~ "27th June",
      week == 27 ~ "4th July"
    )
  )


####################################################################################################  
# SECTION 2: WEEKLY WORD COUNT TIME SERIES GRAPHS
####################################################################################################  


# X-axis ticks shared by all weekly keyword plots: every second week number
# from 22 to 46, labelled with a date two weeks after the previous label.
# Two labels were wrong in the copy-pasted originals and are corrected here:
# "22th August" -> "22nd August" and, for week 40, "30th October" ->
# "3rd October" (two weeks after "19th Sep", two weeks before "17th October").
weekly_axis_breaks <- seq(22, 46, by = 2)
weekly_axis_labels <- c(
  "30th May", "13th June", "27th June", "11th July", "25th July",
  "8th August", "22nd August", "5th Sep", "19th Sep", "3rd October",
  "17th October", "31st Oct", "14th Nov"
)

# Plot the weekly mean of one keyword-count column and save it as a PNG.
#
# Arguments:
#   df            data frame with columns `date`, `week` and `count_col`
#   count_col     name (string) of the per-article count column to average
#   y_title       y-axis title
#   label_y       y position of the "Immediate run" annotation
#   file          output file name passed to ggsave() (8 x 6 in)
#   short_label_y y position of the "Short run" annotation; defaults to label_y
#
# Returns the ggplot object visibly, so a top-level call still displays the
# figure when the script is run line by line.
#
# Only rows with dates strictly between 30 May 2005 and 24 Nov 2005 are used.
# The data are reduced to one row per week before plotting, and the two text
# labels are added with annotate() so each is drawn exactly once instead of
# once per data row; the resulting line is the same.
plot_weekly_mentions <- function(df, count_col, y_title, label_y, file,
                                 short_label_y = label_y) {
  weekly_means <- df %>%
    filter(.data$date > ymd(20050530), .data$date < ymd(20051124)) %>%
    group_by(week) %>%
    summarise(weekly_mean = mean(.data[[count_col]]))

  p <- ggplot(weekly_means, aes(x = week, y = weekly_mean)) +
    geom_line() +
    theme_bw() +
    theme(
      panel.border = element_blank(),
      axis.text.x = element_text(angle = 90, size = 8)
    ) +
    scale_x_continuous(
      breaks = weekly_axis_breaks,
      labels = weekly_axis_labels
    ) +
    geom_vline(xintercept = 27, color = "red") +
    annotate("text", x = 27.5, y = label_y, label = "Immediate run",
             colour = "red", angle = 90, size = 2.5) +
    geom_vline(xintercept = 28, color = "red") +
    annotate("text", x = 29, y = short_label_y, label = "Short run",
             colour = "red", angle = 90, size = 3.5) +
    geom_vline(xintercept = 31, color = "red") +
    labs(y = y_title, x = "Week")

  ggsave(file, plot = p, width = 8, height = 6)
  p
}

#Weekly Terror Law
plot_weekly_mentions(data, "terrorlaw", "Mentions of 'Terror Law'",
                     label_y = 0.06, file = "WeeklyTerror.png")

#Weekly Counter
plot_weekly_mentions(data, "counter", "Mentions of 'Counterterrorism'",
                     label_y = 0.02, file = "WeeklyCounter.png")

#Weekly tap
plot_weekly_mentions(data, "tap", "Mentions of 'Tapping Phones'",
                     label_y = 0.015, file = "WeeklyTap.png")

#Weekly ID
plot_weekly_mentions(data, "idcard", "Mentions of 'Id cards'",
                     label_y = 0.35, file = "WeeklyID.png")

#Weekly ban
plot_weekly_mentions(data, "ban", "Mentions of 'Ban'",
                     label_y = 1, file = "Weeklyban.png")

#Weekly detain
plot_weekly_mentions(data, "detain", "Mentions of 'Detain, Torture'",
                     label_y = 0.5, file = "WeeklyDetain.png")

#Weekly security
plot_weekly_mentions(data, "security", "Mentions of 'Security'",
                     label_y = 0.35, file = "WeeklySecurity.png")

#Weekly freedom
plot_weekly_mentions(data, "freedom", "Mentions of 'Freedom'",
                     label_y = 0.12, file = "WeeklyWeekTotalFreedom.png")

#Weekly muslim
plot_weekly_mentions(data, "muslim", "Mentions of 'Muslim, Islam'",
                     label_y = 0.4, file = "WeeklyTotalMuslim.png")

#Weekly solidarity
plot_weekly_mentions(data, "solidarity", "Mentions of 'Solidarity'",
                     label_y = 0.08, file = "WeeklySolidarity.png")

#Weekly intelligence
# The two annotations sit at different heights in this figure only (0.10 and
# 0.08); those positions are kept as they were.
plot_weekly_mentions(data, "intelligence", "Mentions of 'Intelligence'",
                     label_y = 0.10, short_label_y = 0.08,
                     file = "WeeklyIntelligence.png")

#Weekly bill
plot_weekly_mentions(data, "bill", "Mentions of 'Terrorism Act/Bill'",
                     label_y = 0.08, file = "WeeklyBill.png")



####################################################################################################  
# SECTION 3: SENTIMENT ANALYSIS
####################################################################################################  

#positive, negative
# Bing lexicon: the full word list and its negative subset.
bingnegative <- get_sentiments("bing") %>%
  filter(sentiment == "negative")
bing <- get_sentiments("bing")

# Corpus-wide frequency of every token that appears in the Bing lexicon.
# Any grouping still attached to `tidy_text` is removed first: count() on a
# grouped tibble tallies within each group, which would give one row per
# (group, word) and make the same word appear several times in the chart.
# NOTE(review): the n > 1500 display threshold below is unchanged and may need
# retuning now that counts are pooled over the whole corpus.
bing_word_counts <- tidy_text %>%
  ungroup() %>%
  inner_join(bing) %>%
  count(word, sentiment, sort = TRUE)


# Signed bar chart: negative words are plotted below zero, positive above.
bing_word_counts %>%
  filter(n > 1500) %>%
  mutate(n = ifelse(sentiment == "negative", -n, n)) %>%
  mutate(word = reorder(word, n)) %>%
  ggplot(aes(word, n, fill = sentiment)) +
  geom_bar(stat = "identity") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
  ylab("Contribution to sentiment")



###EMOTIONS

# Tokens that appear in the NRC lexicon, one row per (token, emotion).
sentiment_nrc <- inner_join(tidy_text, get_sentiments("nrc"))

# Put the emotion labels back next to all tokens; tokens without an NRC entry
# get NA in `sentiment`. The join uses every column the two tables share.
# NOTE(review): if `tidy_text` contains rows that are identical in all columns
# (presumably the same word repeated within one article), those rows match
# many-to-many here and are multiplied -- verify the row count of `text_full`.
text_full <- tidy_text %>%
  full_join(sentiment_nrc)

# Unique running row id, and week number recomputed from the date.
text_full$ID <- seq.int(nrow(text_full))
text_full$week <- week(text_full$date)

# Row counts per (day, emotion) and per day, then the same per week, and the
# corresponding shares. `ID` is unique per row, so the number of rows in a
# group equals the number of distinct IDs in it. `text_full` ends up grouped
# by `week`.
text_full <- text_full %>%
  group_by(date, sentiment) %>%
  mutate(DailySentimentCount = n()) %>%
  group_by(date) %>%
  mutate(
    DailySentiment = n(),
    DailySentimentMean = DailySentimentCount / DailySentiment
  ) %>%
  group_by(week, sentiment) %>%
  mutate(WeeklySentimentCount = n()) %>%
  group_by(week) %>%
  mutate(
    WeeklySentiment = n(),
    WeeklySentimentMean = WeeklySentimentCount / WeeklySentiment
  )


####################################################################################################  
# VISUALISATION: SENTIMENT GRAPHS

# Weekly share of each emotion, weeks 22-47. Rows without an emotion label
# (NA or empty) are excluded explicitly. The weekly values are identical on
# every row of a (week, sentiment) pair, so the data are reduced to one row
# per pair before plotting; the drawn lines are the same.
text_full %>%
  filter(week > 21, week < 48, !is.na(sentiment), sentiment != "") %>%
  distinct(week, sentiment, WeeklySentimentMean) %>%
  ggplot(aes(x = week, y = WeeklySentimentMean, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  theme(panel.border = element_blank()) +
  scale_x_continuous(breaks = scales::pretty_breaks(n = 10)) +
  geom_vline(xintercept = 27, color = "red") +
  geom_vline(xintercept = 28, color = "red") +
  geom_vline(xintercept = 31, color = "red") +
  labs(y = "Weekly Sentiment Mean", x = "Week")
ggsave("WeeklyMeanSentiment.png", width = 9, height = 6)

# Weekly count of each emotion with dated x-axis labels.
# Corrections relative to the earlier version of this figure:
#   * axis labels "22th August" -> "22nd August" and, for week 40,
#     "30th October" -> "3rd October";
#   * `labels =` is spelled out instead of relying on partial matching;
#   * the two text labels are added with annotate(), so each is drawn once
#     rather than once per data row;
#   * ggsave() receives the plot object explicitly.
countsentiment <- text_full %>%
  filter(week > 21, week < 48, !is.na(sentiment), sentiment != "") %>%
  distinct(week, sentiment, WeeklySentimentCount) %>%
  ggplot(aes(x = week, y = WeeklySentimentCount, colour = sentiment)) +
  geom_line() +
  theme_bw() +
  theme(
    panel.border = element_blank(),
    axis.text.x = element_text(angle = 90, size = 8)
  ) +
  scale_x_continuous(
    breaks = c(22, 24, 26, 28, 30, 32, 34, 36, 38, 40, 42, 44, 46),
    labels = c("30th May", "13th June", "27th June", "11th July", "25th July",
               "8th August", "22nd August", "5th Sep", "19th Sep",
               "3rd October", "17th October", "31st Oct", "14th Nov")
  ) +
  geom_vline(xintercept = 27, color = "red") +
  annotate("text", x = 27.5, y = 0.08, label = "Immediate run",
           colour = "red", angle = 90, size = 2.5) +
  geom_vline(xintercept = 28, color = "red") +
  annotate("text", x = 29, y = 0.08, label = "Short run",
           colour = "red", angle = 90, size = 3.5) +
  geom_vline(xintercept = 31, color = "red") +
  labs(y = "Weekly Sentiment Count", x = "Week")
ggsave("WeeklyCountSentiment.png", plot = countsentiment, width = 9, height = 6)



####################################################################################################  
# SECTION 4: LOG ODDS CALCULATIONS AND VISUALISATIONS
####################################################################################################  


#the first month
# Drop the week and category columns, then count every non-stop-word in the
# "later" window and attach the overall token total.
data_later <- subset(data_later, select = -c(week, CategoryPublication))

word_later <- data_later %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(total = sum(n))

# Relative frequency of the most frequent words. count(sort = TRUE) orders by
# n, so row_number() is the frequency rank.
# NOTE(review): `rank < 100` / `rank < 50` keep ranks 1-99 / 1-49, although
# the sections below are headed "JOIN 100" / "JOIN 50" -- confirm.
freq_by_rank_short_100 <- word_later %>%
  mutate(rank = row_number(), `short term frequency` = n / total) %>%
  filter(rank < 100)

freq_by_rank_short <- freq_by_rank_short_100 %>%
  filter(rank < 50)


#the first week
# Same steps for the "now" window.
data_now <- subset(data_now, select = -c(week, CategoryPublication))

word_now <- data_now %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words) %>%
  count(word, sort = TRUE) %>%
  mutate(total = sum(n))

freq_by_rank_veryshort_100 <- word_now %>%
  mutate(rank = row_number(), `Immediate term frequency` = n / total) %>%
  filter(rank < 100)

freq_by_rank_veryshort <- freq_by_rank_veryshort_100 %>%
  filter(rank < 50)

#JOIN 50
# Words present in both top lists, with the log of the frequency ratio
# (positive = relatively more frequent in the "now" window).
DF1 <- select(freq_by_rank_short, word, `short term frequency`)
DF2 <- select(freq_by_rank_veryshort, word, `Immediate term frequency`)

joined_data <- inner_join(DF1, DF2, by = "word") %>%
  mutate(logratio = log(`Immediate term frequency` / `short term frequency`))

# Up to 15 words with the largest absolute log ratio on each side of zero,
# ordered by log ratio for plotting.
logratio_extremes <- joined_data %>%
  group_by(logratio < 0) %>%
  top_n(15, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio))

# Default-colour version (displayed only, not saved).
ggplot(logratio_extremes, aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log odds ratio (Immediate Term/Short Term)") +
  scale_fill_discrete(name = "", labels = c("Immediate", "Short"))


#visualising
# Grey-scale version; this is the plot written to disk.
ggplot(logratio_extremes, aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log odds ratio (Immediate Term/Short Term)") +
  scale_fill_manual(name = "", labels = c("Immediate", "Short"), values = c("grey50", "grey30"))
ggsave("LogOdds_bw.png", width = 9, height = 6)

#JOIN 100
# Same join on the longer lists; no plot is produced from it in this section.
DF1_100 <- select(freq_by_rank_short_100, word, `short term frequency`)
DF2_100 <- select(freq_by_rank_veryshort_100, word, `Immediate term frequency`)

joined_data_100 <- inner_join(DF1_100, DF2_100, by = "word") %>%
  mutate(logratio = log(`Immediate term frequency` / `short term frequency`))


############################################################################
# LOG ODDS FOR TABLOIDS VERSION 2 (NON TABLOID)

# Add `CategoryTabloid` ("Tabloid" / "Non Tabloid") based on `Publication`.
# Outlets that are not listed get NA.
classify_tabloid <- function(df) {
  df %>%
    mutate(CategoryTabloid = case_when(
      Publication %in% c("The Express", "The Mirror", "Daily Star Sunday", "DAILY MAIL (London)", "The People", "Morning Star", "The Sun" ) ~ "Tabloid",
      Publication %in% c("The Guardian (London)", "Guardian.com", "The Independent (London)","The Times (London)","The Sunday Times (London)", "THE DAILY TELEGRAPH(LONDON)") ~ "Non Tabloid"
    ))
}

#the first month
data_later_publication <- classify_tabloid(data_later)

table(data_later_publication$CategoryTabloid)

# Word counts per category, plus the per-category token total.
word_later_publication <- data_later_publication %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words) %>%
  group_by(CategoryTabloid) %>%
  count(word, sort = TRUE)

word_later_publication %<>%
  group_by(CategoryTabloid) %>%
  mutate(total = sum(n))

# Within-category frequency rank and relative frequency. Rows without a
# category are removed with is.na(); the previous comparison against the
# string "NA" only dropped them as a side effect of NA propagation in filter().
# NOTE(review): `rank < 50` / `rank < 100` keep ranks 1-49 / 1-99 -- confirm.
freq_by_rank_short_publication <- word_later_publication %>%
  group_by(CategoryTabloid) %>%
  mutate(rank = row_number(), `short term frequency` = n / total) %>%
  filter(rank < 50, !is.na(CategoryTabloid))

freq_by_rank_short_100_publication <- word_later_publication %>%
  group_by(CategoryTabloid) %>%
  mutate(rank = row_number(), `short term frequency` = n / total) %>%
  filter(rank < 100, !is.na(CategoryTabloid))



#the first week
data_now_publication <- classify_tabloid(data_now)

word_now_publication <- data_now_publication %>%
  unnest_tokens(word, Text) %>%
  anti_join(stop_words) %>%
  group_by(CategoryTabloid) %>%
  count(word, sort = TRUE)

word_now_publication %<>%
  group_by(CategoryTabloid) %>%
  mutate(total = sum(n))

freq_by_rank_veryshort_publication <- word_now_publication %>%
  group_by(CategoryTabloid) %>%
  mutate(rank = row_number(), `Immediate term frequency` = n / total) %>%
  filter(rank < 50, !is.na(CategoryTabloid))


freq_by_rank_veryshort_100_publication <- word_now_publication %>%
  group_by(CategoryTabloid) %>%
  mutate(rank = row_number(), `Immediate term frequency` = n / total) %>%
  filter(rank < 100, !is.na(CategoryTabloid))


#JOIN 50
# Words that are in the top list of both windows within the same category,
# with the log of the frequency ratio (positive = relatively more frequent in
# the first week).
DF1 <- freq_by_rank_short_publication %>%
  select(word, CategoryTabloid, `short term frequency`)


DF2 <- freq_by_rank_veryshort_publication %>%
  select(word, CategoryTabloid, `Immediate term frequency`)

joined_data_publication <- inner_join(DF1, DF2, by = c("word", "CategoryTabloid"))

joined_data_publication %<>%
  mutate(logratio = log(`Immediate term frequency` / `short term frequency`))

table(joined_data_publication$CategoryTabloid)

# Tabloids: up to 30 words per sign of the log ratio.
joined_data_publication %>%
  filter(CategoryTabloid == "Tabloid") %>%
  group_by(logratio < 0) %>%
  top_n(30, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log odds ratio (Immediate Term/Short Term)") +
  scale_fill_manual(name = "", labels = c("Immediate", "Short"), values = c("grey50", "grey30"))
ggsave("LogOdds_Tabloid_bw_V2.png", width = 9, height = 6)



# Non-tabloids: up to 50 words per sign of the log ratio.
joined_data_publication %>%
  filter(CategoryTabloid == "Non Tabloid") %>%
  group_by(logratio < 0) %>%
  top_n(50, abs(logratio)) %>%
  ungroup() %>%
  mutate(word = reorder(word, logratio)) %>%
  ggplot(aes(word, logratio, fill = logratio < 0)) +
  geom_col(show.legend = FALSE) +
  coord_flip() +
  ylab("Log odds ratio (Immediate Term/Short Term)") +
  scale_fill_manual(name = "", labels = c("Immediate", "Short"), values = c("grey50", "grey30"))
ggsave("LogOdds_NonTabloid_bw_V2.png", width = 9, height = 6)
